{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__title__      = ML in Action Chapter5 Code\n",
    "__author__     = wgj\n",
    "__createDate__ = 2018-10-16 19:39:25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import *\n",
    "import random\n",
    "\n",
    "def sigmoid(inX):\n",
    "    return 1.0/(1+exp(-inX))\n",
    "\n",
    "\"\"\" 梯度上升法 \"\"\"\n",
    "def gradAscent(dataMatIn, classLabels):\n",
    "    \"\"\"\n",
    "    输入：1、dataMatIn——输入数据\n",
    "         2、classLabels——类别标签\n",
    "    输出：weights——回归系数\n",
    "    \"\"\"\n",
    "    dataMatrix = mat(dataMatIn)             # 转换为NumPy矩阵类型\n",
    "    labelMat = mat(classLabels).transpose() # 转换为NumPy矩阵类型并转置为列向量\n",
    "    m,n = shape(dataMatrix)                 # 输入数据的行、列数\n",
    "    alpha = 0.001                           # 初始化步进因子\n",
    "    maxCycles = 500                         # 初始化迭代次数\n",
    "    weights = ones((n,1))                   # 初始化回归系数，全1矩阵，n行1列\n",
    "    for k in range(maxCycles):\n",
    "        h = sigmoid(dataMatrix*weights)     # 计算对数几率，m行1列（括号内为矩阵乘）\n",
    "        error = (labelMat - h)              # 类别差，m行1列\n",
    "        weights = weights + alpha * dataMatrix.transpose()* error # 更新回归系数\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "“梯度上升法”的不足在于每次更新回归系数时都要遍历整个数据集，计算复杂度高\n",
    "因此，产生了“随机梯度上升法”，即每次仅使用1个样本点数据来更新回归系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\" 随机梯度上升法 \"\"\"\n",
    "def stocGradAscent0(dataMatrix, classLabels):\n",
    "    \"\"\"\n",
    "    输入：1、dataMatIn——输入数据\n",
    "         2、classLabels——类别标签\n",
    "    输出：weights——回归系数\n",
    "    \"\"\"\n",
    "    m,n = shape(dataMatrix)\n",
    "    alpha = 0.01\n",
    "    weights = ones(n)   # 1行n列\n",
    "    for i in range(m):  # 迭代次数等同于样本集数量\n",
    "        h = sigmoid(sum(dataMatrix[i]*weights))  # 输入数据为array，此处为元素乘\n",
    "        error = classLabels[i] - h\n",
    "        weights = weights + alpha * error * dataMatrix[i]  # dataMatrix[i]为第i行数据，共n个数\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "“随机梯度上升法”的不足在于回归系数存在周期震荡波动\n",
    "因此，产生了“改进的随机梯度上升法”，改进之处有两点：\n",
    "1、动态调整步进因子；\n",
    "2、随机选取样本来更新回归系数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\"\"\" 改进的随机梯度上升法 \"\"\"\n",
    "def stocGradAscent1(dataMatrix, classLabels, numIter=150):\n",
    "    \"\"\"\n",
    "    输入：1、dataMatIn——输入数据\n",
    "         2、classLabels——类别标签\n",
    "         3、numIter——迭代次数（可选值，默认为150）\n",
    "    输出：weights——回归系数\n",
    "    \"\"\"\n",
    "    m,n = shape(dataMatrix)\n",
    "    weights = ones(n)\n",
    "    for j in range(numIter):\n",
    "        for i in range(m):\n",
    "            alpha = 4/(1.0+j+i)+0.0001    # alpha随着迭代递减（有常数存在，不会为0）\n",
    "            randIndex = int(random.uniform(0,m))\n",
    "            h = sigmoid(sum(dataMatrix[randIndex]*weights))\n",
    "            error = classLabels[randIndex] - h\n",
    "            weights = weights + alpha * error * dataMatrix[randIndex]\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\" 回归分类 \"\"\"\n",
    "def classifyVector(inX, weights):\n",
    "    \"\"\"\n",
    "    输入：1、inX——待预测数据\n",
    "         2、weights——回归系数\n",
    "    输出：预测类别（0、1）\n",
    "    \"\"\"\n",
    "    prob = sigmoid(sum(inX*weights))\n",
    "    if prob > 0.5: return 1.0\n",
    "    else: return 0.0\n",
    "\n",
    "\"\"\" 从疝气病马预测死亡率 \"\"\"\n",
    "def colicTest():\n",
    "    \"\"\"\n",
    "    输入：无\n",
    "    输出：errorRate——错误率\n",
    "    \"\"\"\n",
    "    frTrain = open('horseColicTraining.txt');  # 打开训练文件\n",
    "    frTest = open('horseColicTest.txt')        # 打开测试文件\n",
    "    trainingSet = []; trainingLabels = []      # 初始化训练集与标签\n",
    "    for line in frTrain.readlines():\n",
    "        currLine = line.strip().split('\\t')    # 移除每行首尾空格并用tab分割数据\n",
    "        lineArr =[]           # 保存一行训练数据\n",
    "        for i in range(21):   # 每行22个数，前21个为特征\n",
    "            lineArr.append(float(currLine[i]))  # 将字符串转换成浮点型\n",
    "        trainingSet.append(lineArr)             # 向训练集中添加1行\n",
    "        trainingLabels.append(float(currLine[21]))  # 向标签添加数据\n",
    "    # 使用改进的梯度上升法计算回归系数\n",
    "    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)\n",
    "    errorCount = 0; numTestVec = 0.0    # 初始化错误数和测试样本数\n",
    "    for line in frTest.readlines():\n",
    "        numTestVec += 1.0\n",
    "        currLine = line.strip().split('\\t')\n",
    "        lineArr =[]\n",
    "        for i in range(21):\n",
    "            lineArr.append(float(currLine[i]))\n",
    "        if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]):\n",
    "            errorCount += 1\n",
    "    errorRate = (float(errorCount)/numTestVec)\n",
    "    print(\"本次测试的错误率为: %f\" % errorRate)\n",
    "    return errorRate\n",
    "\n",
    "\"\"\" 多次测试 \"\"\"\n",
    "def multiTest():\n",
    "    numTests = 10; errorSum = 0.0\n",
    "    for k in range(numTests):\n",
    "        errorSum += colicTest()\n",
    "    print(\"%d 次迭代后，平均错误率为: %f\" % (numTests,errorSum/float(numTests)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "本次测试的错误率为: 0.298507\n",
      "本次测试的错误率为: 0.313433\n",
      "本次测试的错误率为: 0.328358\n",
      "本次测试的错误率为: 0.283582\n",
      "本次测试的错误率为: 0.313433\n",
      "本次测试的错误率为: 0.313433\n",
      "本次测试的错误率为: 0.298507\n",
      "本次测试的错误率为: 0.402985\n",
      "本次测试的错误率为: 0.298507\n",
      "本次测试的错误率为: 0.328358\n",
      "10 次迭代后，平均错误率为: 0.317910\n"
     ]
    }
   ],
   "source": [
    "multiTest()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
